R Markdown

#Loading Packages
#Will most likely add more
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(bulletxtrctr)
## Warning in rgl.init(initValue, onlyNULL): RGL: unable to open X11 display
## Warning: 'rgl_init' failed, running with rgl.useNULL = TRUE
library(x3ptools)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
library(readr)
library(furrr)
## Loading required package: future
library(stringr)
library(dichromat)
library(future)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
library(tidyr)
options(future.globals.maxSize = 4*1024*1024*1024)
#data_dir <- "/media/Raven/REU_Refit"
#load("Data_NeededV2.RData") #Load for all results faster
# Reading in Hamby_173
df <- tibble(path = list.files(path = "Hamby_173", 
                               pattern = ".x3p", recursive = T, 
                               full.names = T)) %>% 
  mutate(Barrel = str_extract(path, "(Unknown|Barrel)\\d{0,2}") %>% 
           str_remove("Barrel"), 
         Bullet = str_extract(path, "Bullet[12A-Z]") %>% 
           str_remove("Bullet"),
         Land = str_extract(path, "land\\d{1}") %>% 
           str_remove("land")) %>% 
  mutate(Set = "Hamby173") %>%
  mutate(x3p = future_map(path, read_x3p)) %>%
 
  mutate(x3p = future_map(x3p, ~x3p_m_to_mum(.) %>% y_flip_x3p()))

# Reading in Hamby_252
df2 <- tibble(path = list.files(path = "Hamby_252", 
                                pattern = ".x3p", recursive = T, 
                                full.names = T)) %>% 
  mutate(Barrel = str_extract(path, "(Unknown|Barrel)\\d{0,2}") %>% 
           str_remove("Barrel"), 
         Bullet = str_extract(path, "Bullet[12A-Z]") %>% 
           str_remove("Bullet"), 
         Land = str_extract(path, "Bullet [12A-Z]-[123456]") %>% 
           str_remove("Bullet [12A-Z]-")) %>% 
  mutate(Set = "Hamby252") %>%
  mutate(x3p = future_map(path,read_x3p))  %>%
  # Adjust orientation
  mutate(x3p = future_map(x3p, ~x3p_m_to_mum(.) %>% rotate_x3p(angle = -90) %>% y_flip_x3p()))
# One big data set - easier to debug the code if you do everything in one big go.
hamby <- bind_rows(df, df2) 
hamby <- hamby %>%
  mutate(id = paste(Set, Barrel, Bullet, Land, sep = "-")) %>%
  select(id, Set, Barrel, Bullet, Land, x3p, path)
rm(df, df2)

Cross Sections look good.

plan(multicore) # use all the cores at once
hamby <- hamby %>%
  mutate(
    CrossSection = future_map_dbl(x3p, x3p_crosscut_optimize, minccf = 0.9, span = 0.3, percent_missing = 25))

head(select(hamby, -path, -x3p), 5)
## # A tibble: 5 x 6
##   id              Set      Barrel Bullet Land  CrossSection
##   <chr>           <chr>    <chr>  <chr>  <chr>        <dbl>
## 1 Hamby173-10-1-1 Hamby173 10     1      1               50
## 2 Hamby173-10-1-2 Hamby173 10     1      2               50
## 3 Hamby173-10-1-3 Hamby173 10     1      3               50
## 4 Hamby173-10-1-4 Hamby173 10     1      4               50
## 5 Hamby173-10-1-5 Hamby173 10     1      5               50
#hamby %>% arrange(desc(CrossSection))

plot1 = hamby %>% 
  filter(Barrel != "Unknown") %>% 
    ggplot(aes(x = Barrel, y = CrossSection, fill = Bullet))+ 
      geom_boxplot()+
          ggtitle("Barrels 1-10")

plot2 = hamby %>% 
  filter(Barrel == "Unknown") %>% 
    ggplot(aes(x = Barrel, y = CrossSection, fill = Bullet))+ 
      geom_boxplot()+
          ggtitle("Barrel Unknown")

grid.arrange(plot1, plot2)

Looking at the Boxplot for Barrels 1-10, we can see the inter-quartile ranges seem to be roughly the same for each Bullet. For Barrels 1-10, Bullet 2 seems to have higher values than Bullet 1. Barrels 6 and 7 have the largest whiskers. Barrel 1, Bullet 1 has the smallest inter-quartile range and Barrel 9, Bullet 2 has the largest interquartile range. There is one apparent outlier for Barrel 10, Bullet 1. This CrossSection Value is not high enough for concern when looking at all Barrels but should be examined separately.

Looking at the Boxplot for Barrel “Unknown” we can see an apparent outlier for Bullet B. This CrossSection value is 400. Unlike with Barrels 1-10 we can see the Bullets for Barrel “Unknown” have noticeably more lower whiskers.

#Cross Sections
hamby <- hamby %>% 
  mutate(CrossCut = future_map2(.x = x3p, .y = CrossSection, .f = x3p_crosscut))

crosscuts <- select(hamby, -path, -x3p) %>% 
      tidyr::unnest(CrossCut)

crosscuts %>% 
  arrange(desc(CrossSection))
## # A tibble: 636,728 x 9
##    id            Set     Barrel Bullet Land  CrossSection     x     y value
##    <chr>         <chr>   <chr>  <chr>  <chr>        <dbl> <dbl> <dbl> <dbl>
##  1 Hamby252-Unk… Hamby2… Unkno… B      2              400  0      400  61.5
##  2 Hamby252-Unk… Hamby2… Unkno… B      2              400  1.56   400  61.5
##  3 Hamby252-Unk… Hamby2… Unkno… B      2              400  3.12   400  61.7
##  4 Hamby252-Unk… Hamby2… Unkno… B      2              400  4.69   400  61.9
##  5 Hamby252-Unk… Hamby2… Unkno… B      2              400  6.25   400  61.9
##  6 Hamby252-Unk… Hamby2… Unkno… B      2              400  7.81   400  62.2
##  7 Hamby252-Unk… Hamby2… Unkno… B      2              400  9.38   400  62.2
##  8 Hamby252-Unk… Hamby2… Unkno… B      2              400 10.9    400  62.3
##  9 Hamby252-Unk… Hamby2… Unkno… B      2              400 12.5    400  62.7
## 10 Hamby252-Unk… Hamby2… Unkno… B      2              400 14.1    400  63.0
## # … with 636,718 more rows

Note: For Hamby173 barrels 1, 2, and 4 for bullet 1 seem to have higher crosscut values plotted. For Hamby173 barrel 1, bullet 2 seems to have a higher crosscut values plotted. There doesn’t seem to be any noticeably low values for Hamby173.

Note: For Hamby 252 barrel 10 and 2, bullet 1 has high crosscut values plotted. There also seems to be some low crosscut values plotted as well.

#Looking at CrossCuts to see any issues

crosscuts %>%
  filter(Barrel != "Unknown" & Set == "Hamby173") %>% 
    ggplot(data = ., aes(x = x, y = value, color = Land)) + 
      geom_line() + 
        facet_grid(Set + paste("Bullet", Bullet) ~ sprintf("Barrel %02s", Barrel))

crosscuts %>%
  filter(Barrel != "Unknown" & Set == "Hamby252") %>% 
    ggplot(data = ., aes(x = x, y = value, color = Land)) + 
      geom_line() + 
        facet_grid(Set + paste("Bullet", Bullet) ~ sprintf("Barrel %02s", Barrel))

crosscuts %>% 
  filter(Set == "Hamby173" & Barrel == "Unknown") %>%
    ggplot(aes(x = x, y = value)) + 
      geom_line() +
        facet_grid(Bullet~Land, labeller="label_both") +
          theme_bw()+ ggtitle("Barrel Unknown for Hamby173")

crosscuts %>% 
  filter(Set == "Hamby252" & Barrel == "Unknown") %>%
    ggplot(aes(x = x, y = value)) + 
      geom_line() +
        facet_grid(Bullet~Land, labeller="label_both") +
          theme_bw()+ ggtitle("Barrel Unknown for Hamby252")

crosscuts %>% filter(id == "Hamby252-Unknown-B-2") %>% 
  ggplot(aes(x = x, y = value))+
    geom_line()+
      facet_grid(Bullet~Land + Set, labeller = "label_both")+
        theme_bw() + ggtitle("High CrossSection Bullet B")

Looking at the CrossCuts we can see the side profile for a bullets land impression for every barrel in Hamby173 and Hamby252. For most of the CrossCuts the shoulders can be clearly identified but there are some that can not be “clearly” identified due to possbile tank rash, pitting, or scans that do not meet the standards that forensic analysis require. I can not say for certain if there are scans that should be excluded. I used FIX3P to examine the “groove to groove” scans and noticed considerable tank rash on some of the scans. There was some noticeable pitting as well but I only felt it was a concern when the pitting was concentrated in the center of the “groove to groove” scan.

When looking at the Crosscuts we can see most have noticeable curvature. There are some scans that do not and these scans can look like a flat line or even a traingle… I Have made note of these particular scans as well.

Groove Loader

#Grooves
saved_grooves_location <- "V2H173_H252_Grooves_data.rda"
if (file.exists(saved_grooves_location)) {
  hamby$Grooves <- readRDS(saved_grooves_location)
} else {
  hamby <- hamby %>% 
    mutate(Grooves = CrossCut %>% 
             future_map(.f = cc_locate_grooves, 
                        method = "rollapply", smoothfactor = 15, return_plot = T))  # use plot so that the shiny app works...
}

grooves <- hamby %>% tidyr::unnest(Grooves)

head(grooves, 10)
## # A tibble: 10 x 8
##    id       Set    Barrel Bullet Land  path            CrossSection Grooves
##    <chr>    <chr>  <chr>  <chr>  <chr> <chr>                  <dbl> <list> 
##  1 Hamby17… Hamby… 10     1      1     Hamby_173/Barr…           50 <dbl […
##  2 Hamby17… Hamby… 10     1      1     Hamby_173/Barr…           50 <S3: g…
##  3 Hamby17… Hamby… 10     1      2     Hamby_173/Barr…           50 <dbl […
##  4 Hamby17… Hamby… 10     1      2     Hamby_173/Barr…           50 <S3: g…
##  5 Hamby17… Hamby… 10     1      3     Hamby_173/Barr…           50 <dbl […
##  6 Hamby17… Hamby… 10     1      3     Hamby_173/Barr…           50 <S3: g…
##  7 Hamby17… Hamby… 10     1      4     Hamby_173/Barr…           50 <dbl […
##  8 Hamby17… Hamby… 10     1      4     Hamby_173/Barr…           50 <S3: g…
##  9 Hamby17… Hamby… 10     1      5     Hamby_173/Barr…           50 <dbl […
## 10 Hamby17… Hamby… 10     1      5     Hamby_173/Barr…           50 <S3: g…
head(select(hamby, -path, -x3p, -CrossCut), 5)
## # A tibble: 5 x 7
##   id              Set      Barrel Bullet Land  CrossSection Grooves   
##   <chr>           <chr>    <chr>  <chr>  <chr>        <dbl> <list>    
## 1 Hamby173-10-1-1 Hamby173 10     1      1               50 <list [2]>
## 2 Hamby173-10-1-2 Hamby173 10     1      2               50 <list [2]>
## 3 Hamby173-10-1-3 Hamby173 10     1      3               50 <list [2]>
## 4 Hamby173-10-1-4 Hamby173 10     1      4               50 <list [2]>
## 5 Hamby173-10-1-5 Hamby173 10     1      5               50 <list [2]>
plan(sequential) # stop furrr multicore processes - messes with shiny/rmd

Examining Groove Cut Offs

Hamby_test <- hamby %>% 
  filter(Set == "Hamby252") %>% 
    filter(Barrel == 6)

Hamby_test2 <- hamby %>% 
  filter(Set == "Hamby252") %>% 
    filter(Barrel == 3 & Bullet == 1)

Hamby_test3 <- hamby %>% 
  filter(Set == "Hamby252") %>% 
    filter(Barrel == 1 & Bullet == 1)

Hamby_test4 <- hamby %>% 
  filter(Set == "Hamby252") %>% 
    filter(Barrel == 9 & Bullet == 2)

Hamby_test5 <- hamby %>% 
  filter(Set == "Hamby252") %>% 
    filter(Barrel == "Unknown") %>% 
      filter(Bullet == "B" | Bullet == "S" | Bullet == "U")

Hamby_test6 <- hamby %>% 
  filter(Set == "Hamby173") %>% 
    filter(Barrel == 3 & Bullet == 2)

Hamby_test7 <- hamby %>% 
  filter(Set == "Hamby173") %>% 
    filter(Barrel == "Unknown") %>% 
      filter(Bullet == "B" | Bullet == "E" | Bullet == "U")


gridExtra::grid.arrange(Hamby_test$Grooves[[1]]$plot,
                        Hamby_test$Grooves[[7]]$plot,
                         Hamby_test2$Grooves[[4]]$plot,
                         Hamby_test3$Grooves[[6]]$plot,
                         Hamby_test4$Grooves[[4]]$plot,
                         Hamby_test5$Grooves[[2]]$plot,
                         Hamby_test5$Grooves[[10]]$plot,
                        nrow = 2)

gridExtra::grid.arrange(Hamby_test6$Grooves[[1]]$plot,
                        Hamby_test7$Grooves[[3]]$plot,
                        Hamby_test7$Grooves[[15]]$plot,
                        Hamby_test7$Grooves[[12]]$plot,
                        nrow = 2)

rm(Hamby_test, Hamby_test2, Hamby_test3, Hamby_test4, Hamby_test5, Hamby_test6, Hamby_test7 )

When run in interactive mode, a shiny app can be used to set smarter grooves.

Groove Identifications by: Andrew Maloney

Using Shiny App

library(shiny)
if (file.exists(saved_grooves_location)) {
  hamby$Grooves <- readRDS(saved_grooves_location)
}
if (interactive()) { # only run when you're manually running chunks... don't run when the whole document is compiled.
  shinyApp(
    ui = fluidPage(
      selectInput("k", "Investigate kth plot:",
                  selected = 1,
                  choices = (1:length(hamby$Grooves)) %>% set_names(hamby$id)
      ),
      textOutput("groovelocations"),
      actionButton("confirm", "Confirm"),
      actionButton("save", "Save"),
      plotOutput("groovePlot", click = "plot_click"),
      verbatimTextOutput("info")
    ),
    
    server = function(input, output, session) {
      output$groovePlot <- renderPlot({
        k <- as.numeric(input$k)
        p <- hamby$Grooves[[k]]$plot
        
        p
      })
      output$groovelocations <- renderText({
        paste(
          "Left Groove: ", hamby$Grooves[[as.numeric(input$k)]]$groove[1],
          " Right Groove: ", hamby$Grooves[[as.numeric(input$k)]]$groove[2]
        )
      })
      observeEvent(input$confirm, {
        cat(paste(hamby$id[as.numeric(input$k)], "\n"))
        updateSelectInput(session, "k", "Investigate kth plot:",
                          selected = as.numeric(input$k) + 1,
                          choices = (1:length(hamby$Grooves)) %>% set_names(hamby$id)
        )
      })
      observeEvent(input$save, {
        saveRDS(hamby$Grooves, file = saved_grooves_location)
        message("groove data saved\n")
      })
      
      observeEvent(input$plot_click, {
        k <- as.numeric(input$k)
        xloc <- input$plot_click$x
        
        gr <- hamby$Grooves[[k]]$groove
        if (abs(gr[1] - xloc) < abs(gr[2] - xloc)) {
          hamby$Grooves[[k]]$groove[1] <<- xloc
        } else {
          hamby$Grooves[[k]]$groove[2] <<- xloc
        }
        output$groovePlot <- renderPlot({
          k <- as.numeric(input$k)
          p <- hamby$Grooves[[k]]$plot +
            geom_vline(xintercept = hamby$Grooves[[k]]$groove[1], colour = "green") +
            geom_vline(xintercept = hamby$Grooves[[k]]$groove[2], colour = "green")
          
          p
        })
      })
      output$info <- renderText({
        paste0("x=", input$plot_click$x, "\ny=", input$plot_click$y)
      })
    },
    options = list(height = 500)
  )
  saveRDS(hamby$Grooves, file = saved_grooves_location)
} else {
  if (!file.exists(saved_grooves_location)) {
    message("run script in interactive mode to fix grooves")
  } else {
    hamby$Grooves <- readRDS(saved_grooves_location)
  }
}


#Test Some Grooves(performed:06/12/19)
#Signatures
hamby <- hamby %>% 
  mutate(Signatures = future_map2(.x = CrossCut, .y = Grooves, .f = cc_get_signature, span = 0.75, span2 = .03)) 

Signatures <- hamby %>% 
  select(id, Set, Barrel, Bullet, Land, Signatures) %>% 
    tidyr::unnest()
#Looking for major issues - grooves not set correctly (large deviations at beginning or end of a line)

Signatures %>%
  filter(Barrel != "Unknown") %>% 
ggplot(data = ., aes(x = x, y = sig, color = Land)) + 
  geom_line()+
  facet_grid(Set + paste("Bullet", Bullet) ~ sprintf("Barrel %02s", Barrel)) # put the barrels in the right order
## Warning: Removed 2733 rows containing missing values (geom_path).

Signatures %>%
  filter(Set == "Hamby173") %>%
  filter(Barrel == "Unknown") %>% 
ggplot(data = ., aes(x = x, y = sig, color = Land)) + 
  geom_line()+
  facet_wrap(paste("Bullet", Bullet) ~ sprintf("Barrel %02s", Barrel), ncol = 4)
## Warning: Removed 1682 rows containing missing values (geom_path).

Signatures %>%
  filter(Set == "Hamby252") %>%
  filter(Barrel == "Unknown") %>% 
ggplot(data = ., aes(x = x, y = sig, color = Land)) + 
  geom_line()+
  facet_wrap(paste("Bullet", Bullet) ~ sprintf("Barrel %02s", Barrel), ncol = 4)
## Warning: Removed 1550 rows containing missing values (geom_path).

load("Data_NeededV3.RData")

Examining the Signature Profiles we can see for barrels 1-10, for both Hamby173 + Hamby252, that the signatures for lands 1-6 are aligned pretty well. There seems to be some noticeable deviations at the beginning and end for Barrels 1-10 in both sets. I looked at the groove to groove scans using FIX3P and these spikes at the beginning and end for Barrels 1 and 10 can be identified. Groove locations seem to be set correctly for these lands.

There is some noticeable missing data for Signature comparisons [Hamby252/Bullet1] when filling by Bullet. This is hard to see when filling by the Lands.

We also examine the number of consecutively matching peaks in signatures for two aligned lands.(Case Validation Paper)

Groove Identifications Approved

comparisons <- crossing(Bullet1 = hamby$id, Bullet2 = hamby$id) %>%
  left_join(nest(hamby, -id) %>% magrittr::set_names(c("Bullet1", "Bullet1_data"))) %>%
  left_join(nest(hamby, -id) %>% magrittr::set_names(c("Bullet2", "Bullet2_data"))) %>%
  mutate(Set1 = str_extract(Bullet1, "Hamby\\d{2,3}"),
         Set2 = str_extract(Bullet2, "Hamby\\d{2,3}")) %>%
  filter(Set1 == Set2) %>% # Get rid of cross-set comparisons for now...
  select(-matches("Set"))
#plan(multicore(workers = availableCores(constraints = 8)))

get_sig <- function(data) {
  purrr::map(data$Signatures, "sig")
}
comparisons <- comparisons %>%
  mutate(sig1 = purrr::map(Bullet1_data, get_sig), sig2 = purrr::map(Bullet2_data, get_sig))

plan(multicore)

comparisons <- comparisons %>%
  mutate(Aligned = future_map2(sig1, sig2, ~sig_align(unlist(.x), unlist(.y)))) # Getting Aligned signatures

# Get striae
comparisons <- comparisons %>%
  mutate(Striae = future_map(Aligned, sig_cms_max)) # Obtaining Striae

saveRDS(select(comparisons, -Bullet1_data, -Bullet2_data), file = "Hamby_173_252_Comparisons.rda")

Extracting features

comparisons <- comparisons %>% 
  select(-Bullet1_data, -Bullet2_data)

plan(multicore(workers = availableCores(constraints = 8))) # Core Restraint so server does not die 

comparisons <- comparisons %>% 
  mutate(features = future_map2(.x = Aligned, .y = Striae, .f = extract_features_all, resolution = 1.5625)) #ObtainingFeatures

comparisons <- comparisons %>% 
  mutate(Legacy_Features = future_map(Striae, extract_features_all_legacy, resolution = 1.5625)) # Obtaining feature leacy

comparisons <- comparisons %>% 
  tidyr::unnest(Legacy_Features) # Extracting feature legacy

Creating Columns for organization and for future plotting purposes.

comparisons <- comparisons %>% 
  mutate(Set = gsub("(Hamby173|Hamby252)-([0-9]{1,2}|Unknown)-([1-2A-Z])-([1-6])", "\\1", Bullet2)) # Creating columns from IDs
comparisons <- comparisons %>% 
  mutate(BarrelA = gsub("(Hamby173|Hamby252)-([0-9]{1,2}|Unknown)-([1-2A-Z])-([1-6])", "\\2", Bullet2))
comparisons <- comparisons %>% 
  mutate(BarrelB = gsub("(Hamby173|Hamby252)-([0-9]{0,2}|Unknown)-([1-2A-Z])-([1-6])", "\\2", Bullet1))
comparisons <- comparisons %>% 
  mutate(BulletA = gsub("(Hamby173|Hamby252)-([0-9]{1,2}|Unknown)-([1-2A-Z])-([1-6])", "\\3", Bullet2))
comparisons <- comparisons %>% 
  mutate(BulletB = gsub("(Hamby173|Hamby252)-([0-9]{1,2}|Unknown)-([1-2A-Z])-([1-6])", "\\3", Bullet1))
comparisons <- comparisons %>% 
  mutate(LandA = gsub("(Hamby173|Hamby252)-([0-9]{1,2}|Unknown)-([1-2A-Z])-([1-6])", "\\4", Bullet2))
comparisons <- comparisons %>% 
  mutate(LandB = gsub("(Hamby173|Hamby252)-([0-9]{1,2}|Unknown)-([1-2A-Z])-([1-6])", "\\4", Bullet1))
head(comparisons, 500)
## # A tibble: 500 x 21
##    Bullet1 Bullet2   ccf rough_cor    lag       D    sd_D signature_length
##    <chr>   <chr>   <dbl>     <dbl>  <dbl>   <dbl>   <dbl>            <dbl>
##  1 Hamby1… Hamby1… 1         1      0     0       0.00489             1.64
##  2 Hamby1… Hamby1… 0.350     0.350  0.116 0.00259 0.00479             2.02
##  3 Hamby1… Hamby1… 0.519     0.519 -0.164 0.00186 0.00348             1.88
##  4 Hamby1… Hamby1… 0.220     0.220  0.146 0.00295 0.00478             2.09
##  5 Hamby1… Hamby1… 0.292     0.292  0.035 0.00250 0.00442             1.80
##  6 Hamby1… Hamby1… 0.248     0.248  0     0.00284 0.00458             1.78
##  7 Hamby1… Hamby1… 0.248     0.248 -0.088 0.00222 0.00377             1.76
##  8 Hamby1… Hamby1… 0.287     0.287  0.115 0.00233 0.00425             1.94
##  9 Hamby1… Hamby1… 0.258     0.258 -0.113 0.00438 0.00683             1.66
## 10 Hamby1… Hamby1… 0.271     0.271  0.159 0.00246 0.00370             1.89
## # … with 490 more rows, and 13 more variables: overlap <dbl>,
## #   matches <dbl>, mismatches <dbl>, cms <dbl>, non_cms <dbl>,
## #   sum_peaks <dbl>, Set <chr>, BarrelA <chr>, BarrelB <chr>,
## #   BulletA <chr>, BulletB <chr>, LandA <chr>, LandB <chr>

Features 2017 Data

features_2017 <- read_csv("features-hamby.csv") #Reading in csv
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   match = col_logical(),
##   study1 = col_character(),
##   study2 = col_character(),
##   barrel2 = col_character()
## )
## See spec(...) for full column specifications.
## Warning: 31240 parsing failures.
##   row     col expected actual                 file
## 90507 barrel1 a double      A 'features-hamby.csv'
## 90508 barrel1 a double      A 'features-hamby.csv'
## 90509 barrel1 a double      A 'features-hamby.csv'
## 90510 barrel1 a double      A 'features-hamby.csv'
## 90511 barrel1 a double      A 'features-hamby.csv'
## ..... ....... ........ ...... ....................
## See problems(...) for more details.
features_2017 <- features_2017 %>%
  select(-land1_id, -land2_id) %>% # removing
    filter(study1 != "Cary" & study2 != "Cary") %>% # removing
      rename(BarrelB = barrel1, BulletB = bullet1, LandB = land1) %>% # Changed column names
      rename(BarrelA = barrel2, BulletA = bullet2, LandA = land2) %>% # Changed column names
        select(-study1) %>%
      rename(Set = study2) %>%
          mutate(Set = gsub("Hamby44", "Hamby173", Set)) %>% #Chnaging observation name
            mutate(Set = factor(Set, levels = c("Hamby173", "Hamby252"))) %>% # for ordering principles
      rename(ccf2 = ccf, rough_cor2 = rough_cor, lag2 = lag, D2 = D, sd_D2 = sd_D, signature_length2 = signature_length, overlap2 = overlap, matches2 = matches, mismatches2 = mismatches, cms2x = cms, non_cms2 = non_cms, sum_peaks2 = sum_peaks) # Column names changed for comparing purposes 


#Exploring we see that all lettered Barrels only have bullet equal to 1. No need to worry about a lettered barrel having a bullet 2
# Code will not look like this forever... Duct Tape... Will Dplyr soon...

features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "A"] <- "A") #Assign A to Bullets with column "BarrelA" equalequal to "A"
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "B"] <- "B")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "C"] <- "C")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "D"] <- "D")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "E"] <- "E")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "F"] <- "F")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "G"] <- "G")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "H"] <- "H")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "I"] <- "I")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "J"] <- "J")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "L"] <- "L")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "M"] <- "M")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "N"] <- "N")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "Q"] <- "Q")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "R"] <- "R")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "S"] <- "S")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "U"] <- "U")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "V"] <- "V")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "W"] <- "W")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "X"] <- "X")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "Y"] <- "Y")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "Z"] <- "Z")



features_2017$BarrelA[features_2017$BarrelA == "A"] <- "Unknown" #Group lettered Barrels together into Barrel "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "B"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "C"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "D"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "E"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "F"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "G"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "H"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "I"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "J"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "L"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "M"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "N"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "Q"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "R"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "S"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "U"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "V"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "W"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "X"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "Y"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "Z"] <- "Unknown"

features_2017 <- features_2017 %>% 
  mutate(Bullet1 = paste(Set, BarrelB, BulletB, LandB, sep = "-"),
         Bullet2 = paste(Set, BarrelA, BulletA, LandA, sep = "-")) # Creating ID similar to Heike's 


features_2017 <- features_2017[order(features_2017$Set), ] #Ordered Set column so that all 173 observations came before 252 observations
                                                          # At first glance Hamby173 Barrel 10, Bullet 1, Land 1 seems to be missing

features_2017 <- features_2017 %>% 
  mutate(BarrelB = as.character(BarrelB), 
         BulletA = as.character(BulletA), 
         BulletB = as.character(BulletB), 
         LandA = as.character(LandA), 
         LandB = as.character(LandB))

Comparing 2017 and 2019 after basic cleaning

table(comparisons$BarrelA)
## 
##       1      10       2       3       4       5       6       7       8 
##    5040    5040    5040    5040    5040    5040    5040    5040    5040 
##       9 Unknown 
##    5040   37800
table(comparisons$BulletA)
## 
##     1     2     A     B     C     D     E     F     G     H     I     J 
## 25200 25200  1260  2520  2520  1260  2520  1260  1260  2520  1260  1260 
##     L     M     N     Q     R     S     U     V     W     X     Y     Z 
##  2520  2520  1260  1260  1260  1260  2520  1260  1260  2520  1260  1260
table(comparisons$LandA)
## 
##     1     2     3     4     5     6 
## 14700 14700 14700 14700 14700 14700
#----------------------------------------------------------------#

table(features_2017$BarrelA)
## 
##       1      10       2       3       4       5       6       7       8 
##     852     276    1428    1909    2438    3108    3519    4236    4812 
##       9 Unknown 
##    5152   56936
table(features_2017$BulletA)
## 
##     1     2     A     B     C     D     E     F     G     H     I     J 
## 13570 14160  1431  3102  3498  2031  3606  2103  1575  3750  1647  2175 
##     L     M     N     Q     R     S     U     V     W     X     Y     Z 
##  3894  3677  1749  1900  1785  2313  3776  1857  1893  4308  2415  2451
table(features_2017$LandA)
## 
##     1     2     3     4     5     6 
## 13927 13781 14401 13411 14538 14608

Heat Maps

library(viridis)
## Loading required package: viridisLite
comparisons %>%                          #Comparison heatmap for 2019 data
  filter(!is.na(BarrelA)) %>% 
    filter(!is.na(BarrelB)) %>% 
      group_by(BarrelA, BarrelB, Set) %>% 
        summarise(count = n()) %>% 
  ggplot(aes(x = BarrelA, y = BarrelB))+
    geom_tile(aes(fill = count))+
      scale_fill_viridis(option = "plasma", direction = -1)+ 
        geom_text(aes(label = count))+ 
          ggtitle("Comparison Tiles")+ 
            theme_bw()+ 
              facet_grid(~Set) #Good way to compare graphs below.  Shows how bad "features-hamby.csv" really is. 

comparisons %>% 
  filter(!is.na(BarrelA)) %>% 
    filter(!is.na(BarrelB)) %>% 
      group_by(BarrelA, BarrelB, Set) %>% summarise(count = max(signature_length)) %>% 
  ggplot(aes(x = BarrelA, y = BarrelB))+
    geom_tile(aes(fill = count))+
      scale_fill_viridis(option = "plasma", direction = -1)+ 
        geom_text(aes(label = count))+ 
          ggtitle("Comparison Signature_Length Tiles")+ 
            theme_bw()+
              facet_grid(~Set)

features_2017 %>%                        # features_2017 plot 1
  filter(!is.na(BarrelA)) %>% 
    filter(!is.na(BarrelB)) %>% 
      group_by(BarrelA, BarrelB, Set) %>% 
        summarise(count = n()) %>% 
  ggplot(aes(x = BarrelA, y = BarrelB))+
    geom_tile(aes(fill = count))+
      scale_fill_viridis(option = "plasma", direction = -1)+ 
        geom_text(aes(label = count))+ 
          ggtitle("Comparison Count Version 1")+ 
            theme_bw()+ theme(axis.text.x = element_text(angle = 10))+
              facet_grid(~Set) #Very Promising Results.  This geom_tile/heat_map shows my analysis in visual format.

features_2017 %>%                       # features_2017 plot 2
 filter(!is.na(BarrelA)) %>%
   filter(!is.na(BarrelB)) %>%
     group_by(BarrelA, BarrelB, Bullet1, Bullet2, Set) %>%
      arrange(desc(signature_length2)) %>% 
       filter(row_number() == 1) %>% 
                ungroup() %>% 
  group_by(BarrelA, BarrelB, Set) %>%
    summarise(count = n()) %>% 
      ggplot(aes(x = BarrelA, y = BarrelB))+
       geom_tile(aes(fill = count))+
        scale_fill_viridis(option = "plasma", direction = -1)+
          geom_text(aes(label = count))+
            ggtitle("Comparison Count Version 2")+
              theme_bw()+ theme(axis.text.x = element_text(angle = 10))+
                 facet_grid(~Set)

We shall use the comparison plot from our 2019 data to help compare our data from 2017. Looking at features_2017 plot1 we can see that my analysis about duplicate data seems to be correct. Looking at features_2017 plot2 we can see that there is also missing data on the Barrel-Bullet-Land level. Both of these visual representations are also written in the word.document

Finding Missing Values

# Let's try to find those missing values with an easy way
# Inspiration came from Susan and 
#https://www.r-bloggers.com/r-sorting-a-data-frame-by-the-contents-of-a-column/
#https://www.rdocumentation.org/packages/base/versions/3.6.0/topics/t 

comparisons_check <- comparisons %>% 
  select(Bullet1, Bullet2) #Remove Columns we wont be using

features_2017_check <- features_2017 %>% 
  select(Bullet1, Bullet2) # Remove Columns we wont be using

finder <- t(apply(comparisons_check,1,sort))
finder <- data.frame(finder)

#Take Transpose and Sort in order... Now we have:
#AB   => AB
#BA   => AB 

finder <- finder[!duplicated(finder),] # Remove duplicates 

finder <- finder %>% 
  rename(Bullet2 = X2, Bullet1 = X1) %>% 
    select(Bullet1, Bullet2) 
      
finder <- anti_join(finder, features_2017_check, by = c("Bullet2")) # Shows what data is missing and if you add Bullet1 to the anti_join you can see all thee missing Unknown|Unknown Comparisons 
## Warning: Column `Bullet2` joining factor and character vector, coercing
## into character vector

Creating Data Frames for feature comparisons

#Better version of above method I think, using dplyr
comparison_split <- comparisons %>%                                                    #Assign Comparisons
  rowwise() %>%                                                                        #Every Row
    mutate(sorter = paste(sort(c(Bullet1, Bullet2)), collapse="-")) %>%                # create col "sorter", which pastes both cols and sorts in order
      distinct(sorter, .keep_all=T) %>%                                                # remove all rows with duplicate values in sorter
        select(-sorter)                                                                # no need for column 
 


feature_id1 <- features_2017[!duplicated(features_2017[c("Bullet1","Bullet2")]),] #profile1
feature_id2 <- anti_join(features_2017, feature_id1)                              #profile2
## Joining, by = c("compare_id", "profile1_id", "profile2_id", "ccf2", "rough_cor2", "lag2", "D2", "sd_D2", "signature_length2", "overlap2", "matches2", "mismatches2", "cms2x", "non_cms2", "sum_peaks2", "match", "BarrelB", "BulletB", "LandB", "Set", "BarrelA", "BulletA", "LandA", "Bullet1", "Bullet2")
feature_id1 <- na.omit(feature_id1) # Remove Na
feature_id2 <- na.omit(feature_id2) # Remove Na

scatter1 <- inner_join(comparison_split, feature_id1, by = c("Bullet1", "Bullet2")) #ScatterPlot Comparisons for profile_id version 1
scatter2 <- inner_join(comparison_split, feature_id2, by = c("Bullet1", "Bullet2")) # ScatterPlot Comparisons for Profile_id version 2

ScatterPlots

scatter1 %>% ggplot(aes(x = ccf, y = ccf2))+
  geom_bin2d(bins = 65)+
  geom_smooth(method = lm)+
      geom_smooth(aes(colour = "red"), show.legend = FALSE)+
        theme_bw()+
        ggtitle("ccf Vs ccf2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

scatter2 %>% ggplot(aes(x = ccf, y = ccf2))+
  geom_bin2d(bins = 65)+
  geom_smooth(method = lm)+
      geom_smooth(aes(colour = "red"), show.legend = FALSE)+
        theme_bw()+        
        ggtitle("ScatterPlot2 ccf Vs ccf2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

scatter1 %>% ggplot(aes(x = rough_cor, y = rough_cor2))+
  geom_bin2d(bins = 65)+
  geom_smooth(method = lm)+
      geom_smooth(aes(colour = "red"), show.legend = FALSE)+
        theme_bw()+
        ggtitle("rough_cor Vs rough_cor2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

scatter2 %>% ggplot(aes(x = rough_cor, y = rough_cor2))+
  geom_bin2d(bins = 65)+
  geom_smooth(method = lm)+
      geom_smooth(aes(colour = "red"), show.legend = FALSE)+
        theme_bw()+
        ggtitle("ScatterPlot2 rough_cor Vs rough_cor2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

scatter1 %>% ggplot(aes(x = cms, y = cms2x))+
  geom_bin2d(bins = 65)+
  geom_smooth(method = lm)+
      geom_smooth(aes(colour = "red"), show.legend = FALSE)+
        theme_bw()+
        ggtitle("cms Vs cms2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

scatter2 %>% ggplot(aes(x = cms, y = cms2x))+
  geom_bin2d(bins = 65)+
  geom_smooth(method = lm)+
      geom_smooth(aes(colour = "red"), show.legend = FALSE)+
        theme_bw()+
        ggtitle("ScatterPlot2 cms Vs cms2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

scatter1 %>% ggplot(aes(x = sum_peaks, y = sum_peaks2))+
  geom_bin2d(bins = 65)+
  geom_smooth(method = lm)+
      geom_smooth(aes(colour = "red"), show.legend = FALSE)+
        theme_bw()+
        ggtitle("sum_peaks Vs sum_peaks2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

scatter2 %>% ggplot(aes(x = sum_peaks, y = sum_peaks2))+
  geom_bin2d(bins = 65)+
  geom_smooth(method = lm)+
      geom_smooth(aes(colour = "red"), show.legend = FALSE)+
        theme_bw()+
        ggtitle("ScatterPlot2 sum_peaks Vs sum_peaks2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

scatter1 %>% ggplot(aes(x = signature_length, y = signature_length2))+
  geom_bin2d(bins = 65)+
  geom_smooth(method = lm)+
      geom_smooth(aes(colour = "red"), show.legend = FALSE)+
        theme_bw()+
        ggtitle("signature_length Vs signature_length2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

scatter2 %>% ggplot(aes(x = signature_length, y = signature_length2))+
  geom_bin2d(bins = 65)+
  geom_smooth(method = lm)+
      geom_smooth(aes(colour = "red"), show.legend = FALSE)+
        theme_bw()+
        ggtitle("ScatterPlot2 signature_length Vs signature_length2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

scatter1 %>% ggplot(aes(x = D, y = D2))+
  geom_bin2d(bins = 65)+
  geom_smooth(method = lm)+
      geom_smooth(aes(colour = "red"), show.legend = FALSE)+
        theme_bw()+
        ggtitle("Distance Vs Distance2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

scatter2 %>% ggplot(aes(x = D, y = D2))+
  geom_bin2d(bins = 65)+
  geom_smooth(method = lm)+
      geom_smooth(aes(colour = "red"), show.legend = FALSE)+
        theme_bw()+
        ggtitle("ScatterPlot2 Distance Vs Distance2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Scatter Plots

scatterA <- gather(scatter1, key="measure", value="value", c("cms", "non_cms", "matches", "mismatches", "D", "sum_peaks", "signature_length", "ccf2", "D2", "signature_length2", "matches2", "mismatches2", "cms2x", "non_cms2", "sum_peaks2")) 

scatterB <- gather(scatter1, key="measure", value="value", c("cms", "non_cms", "matches", "mismatches", "D", "sum_peaks", "signature_length", "ccf", "D2", "signature_length2", "matches2", "mismatches2", "cms2x", "non_cms2", "sum_peaks2")) 


scatterA %>% 
    ggplot(aes(x = ccf, y = value))+
      geom_point()+
      geom_smooth(method = lm)+
      geom_smooth(aes(colour = "red"), show.legend = FALSE)+
        facet_wrap(~measure, ncol = 4, scale = "free")+ scale_x_log10()+
           theme_bw()+
            ggtitle("ccf VS other features")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

scatterB %>% 
    ggplot(aes(x = ccf2, y = value))+
      geom_point()+
      geom_smooth(method = lm)+
      geom_smooth(aes(colour = "red"), show.legend = FALSE)+
        facet_wrap(~measure, ncol = 4, scale = "free")+
          theme_bw()+
            ggtitle("ccf2 VS other features")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#scatterA %>% 
    #ggplot(aes(x = cms, y = value))+
      #geom_point()+
      #geom_smooth(method = lm)+
      #geom_smooth(aes(colour = "red"), show.legend = FALSE)+
        #facet_wrap(~measure, ncol = 4, scale = "free")+
           #theme_bw()+ 
            #ggtitle("cms VS other features")
#scatterB %>% 
    #ggplot(aes(x = cms2x, y = value))+
      #geom_point()+
      #geom_smooth(method = lm)+
      #geom_smooth(aes(colour = "red"), show.legend = FALSE)+
        #facet_wrap(~measure, ncol = 4, scale = "free")+
         #theme_bw()+
            #ggtitle("cms2x VS other features")

#---------------------------------------------------------------------------------------#
#CHECK 

Looking at the normal/facet_grid scatterplots we can make some observations but before that, let me go over some details. Two geom_smooth() layers were added. The blue line represents a lm method and the red line represents the gam method. Looking at the graphs where each feature is compared with its other version, we see there is no linear relationship between x and y. For the graphs where ccf, ccf2, cms, and cms2 are compared to the other features, we also see no linear relationships between x and y. The scatterplots look like huge clouds.